# Description ------------------------------------------------------------------

### This merges the short and long endline vendor surveys for the
### TAD Impact Evaluation Project, cleans combined data, then saves the cleaned
### version as a .Rdata file

# Packages ---------------------------------------------------------------------
#if packages are not installed, they must be installed with:
# install.packages("package_name"); The name of package must be in quotes
library(dplyr)
library(readr)
library(stringr)
library(lubridate)
library(purrr)
library(haven)
library(labelled)
library(hms)


# Convenience Functions --------------------------------------------------------
source("scripts/0_functions/functions_cleaning.R")

# Merging Long and Short -------------------------------------------------------
# Read the de-identified short and long endline vendor surveys (Stata files).
short <- read_dta("data/1_raw/tad_endline_market_short_nopii.dta")
long <- read_dta("data/1_raw/tad_endline_market_long_nopii.dta")

# Diagnostic print: one logical per short-survey column, TRUE when the long
# survey has a column of the same name.
is.element(names(short), names(long))

# Keep only the short-survey columns that also exist in the long survey (the
# short survey was cleaned inconsistently). Columns are selected by position so
# the short survey's own column order is retained in the combined data.
short <- select(short, which(is.element(names(short), names(long))))

# Diagnostic print: markets present in the long survey but absent from the
# short survey (expected to be empty).
local({
  long_mkts <- unique(to_factor(long$market))
  short_mkts <- unique(to_factor(short$market))
  long_mkts[!(long_mkts %in% short_mkts)]
})
#All markets present

# Flag the questionnaire version so rows can be told apart once combined
# (0 = short survey, 1 = long survey).
short <- mutate(short, long_survey = 0)
long <- mutate(long, long_survey = 1)

# Row-binding is expected to drop the haven labels, so stash them first. The
# long survey is used as the source because it contains every variable.
labels_vars <- var_label(long)   # variable labels
labels_vals <- val_labels(long)  # value labels

# Stack the two surveys: short-survey rows first, then long-survey rows.
vendor_end <- bind_rows(short, long)
head(vendor_end)

# Restore the variable and value labels on the combined data.
var_label(vendor_end) <- labels_vars
val_labels(vendor_end) <- labels_vals

# Visual check that the labels are back.
head(vendor_end)

# Diagnostic print: number of observations per market, smallest first.
to_factor(arrange(summarise(group_by(vendor_end, market), n = n()), n))

# Cleaning Data ----------------------------------------------------------------

###### Fixing issue with Luwerezi, which was not a sample market
# Keep an untouched copy of the market variable before any recoding.
vendor_end$market_old <- vendor_end$market

# Print the value labels of market to look up Luwerezi's code.
val_labels(vendor_end$market)
#106

# Luwerezi (106) was selected by mistake. Print start time, supervisor, market
# and enumerator for those interviews to work out the intended market.
filter(select(vendor_end, starttime, sup, market, enum), market == 106)
# Per the field schedule, 10 of these interviews belong to Luviri (105) and
# 1 belongs to Mpherembe (112).

# Recode the 106 rows. NOTE: the replacement vector is positional -- it relies
# on the current row order, with the 9th matching row being the Mpherembe one.
vendor_end$market[which(vendor_end$market == 106)] <- c(rep(105, 8), 112,
                                                        rep(105, 2))

# Print new vs. old market for the recoded rows to confirm the change.
filter(select(vendor_end, market, market_old), market_old == 106)

##### Correcting seemingly incorrect market selections
# Overview: an interview whose calendar day differs from the modal day of its
# recorded market is treated as a likely mis-selected market. When the
# supervisor nevertheless matches the market's modal supervisor, the market is
# replaced by the modal market among "trusted" interviews (inc_mkt is FALSE)
# done by the same supervisor on the same day and month.
# Mode() is not defined in this file (presumably it comes from the sourced
# cleaning functions and returns the most frequent value -- confirm there).

#making a start_day and start_month variable from the interview start time
vendor_end <- vendor_end %>%
  mutate(start_day = day(starttime),
         start_month = month(starttime))

#create logical vector to indicate day of interview doesn't match day of rest of
#obs for that market (excl Jenda which was visited twice and enum 63, whose tablet
#was programmed incorrectly)
vendor_end <- vendor_end %>%
  group_by(market) %>%
  mutate(inc_mkt = (start_day != Mode(start_day) & market != 100 & enum != 63)) %>%
  ungroup()

#create logical vector to indicate whether each interview's supervisor is the
#modal supervisor of its market
same_sup <- vendor_end %>%
  group_by(market) %>%
  mutate(same_sup = (sup == Mode(sup))) %>%
  ungroup() %>%
  pull(same_sup)

#create space for logical vector that will indicate when a change was made to
#market: NA = row not considered, FALSE = considered but no match found,
#TRUE = market replaced
vendor_end$change_to_mkt <- NA

#running through observations; seq_len() keeps the loop a no-op on zero rows
for (i in seq_len(nrow(vendor_end))) {
  #only make changes if market is flagged as incorrect and the supervisor is the
  #market's modal one; isTRUE() treats a missing flag as "do not change"
  #instead of aborting the script with an error
  if (isTRUE(vendor_end$inc_mkt[i] && same_sup[i])) {
    #markets of trusted interviews by the same supervisor on the same date.
    #Only rows with inc_mkt FALSE are read here and those rows are never
    #modified by this loop, so earlier iterations cannot affect the result.
    cor_mkt_id <- vendor_end %>%
      filter(sup == vendor_end$sup[i],
             start_day == vendor_end$start_day[i],
             start_month == vendor_end$start_month[i],
             !inc_mkt) %>%
      pull(market)
    #if length of cor_mkt_id is 0, that means market cannot be matched with sup
    #and days
    if (length(cor_mkt_id) == 0) {
      vendor_end$change_to_mkt[i] <- FALSE
    } else {
      vendor_end$market[i] <- Mode(cor_mkt_id)
      vendor_end$change_to_mkt[i] <- TRUE
    }
  }
}

#checking number of changes
table(vendor_end$change_to_mkt)

# Side-by-side table of observation counts per market after the corrections
# (n, keyed by the market name; the numeric code is kept in market_ID) and
# before them (n_old, counted on market_old). Both sides are keyed by the
# market's label text.
old_new_check <- left_join(
  vendor_end %>%
    group_by(market) %>%
    summarise(n = n()) %>%
    mutate(market_ID = market,
           market = as.character(to_factor(market))) %>%
    arrange(market),
  vendor_end %>%
    group_by(market_old) %>%
    summarise(n_old = n()) %>%
    mutate(market_old = as.character(to_factor(market_old))) %>%
    arrange(market_old),
  by = c("market" = "market_old")
)

#removing unnecessary markets from labels: collect the value labels of market
#whose label text no longer occurs in the data, then drop them
not_used <- val_labels(vendor_end$market)[
  !(names(val_labels(vendor_end$market)) %in%
      unique(to_factor(vendor_end$market)))
]
vendor_end <- remove_value_labels(vendor_end, market = not_used)

##### Correcting Wrong District Selections
# Keep an untouched copy of the recorded district.
vendor_end$district_old <- vendor_end$district

# Within each market: first flag rows whose district differs from the market's
# modal district (inc_dist), then overwrite district with that modal value.
# The flag is computed before the overwrite, so it reflects the recorded data.
vendor_end <- vendor_end %>%
  group_by(market) %>%
  mutate(inc_dist = (district != Mode(district)),
         district = Mode(district)) %>%
  ungroup()

# Drop the value label attached to code 9 (the pilot) from the district labels.
val_label(vendor_end$district_old, 9) <- NULL

# Relabel the corrected district with the same value labels as district_old.
val_labels(vendor_end$district) <- val_labels(vendor_end$district_old)

##### Demographic and Economic Variables

# Derive demographic and economic analysis columns from the raw survey items.
# neg_to_na() and rev_values() are not defined in this file (presumably they
# come from the sourced cleaning functions: neg_to_na() looks like it blanks
# the negative non-response codes and, with outlier_top, values above a
# cut-off; rev_values() looks like it reverses a scale -- confirm there).
# NOTE: new columns are appended in the order written below, and later
# remove_NA_labels() calls address columns by position, so do not reorder.
vendor_end <- mutate(
  vendor_end,
  #dem vars
  gender = neg_to_na(r1),
  female = ifelse(gender == 1, 1, 0),
  born_dist = neg_to_na(d1),
  curr_dist = neg_to_na(d2),
  tribe = neg_to_na(d5),
  #also drop unrealistic values of age
  age = neg_to_na(d8, outlier_top = 200),
  #code 888 is mapped to 0; every other level is shifted up by one
  education = ifelse(d12 == 888, 0, neg_to_na(d12) + 1),
  reading_language = neg_to_na(d13),
  literacy = rev_values(neg_to_na(d14)),
  literacy_high = ifelse(literacy == 4, 1, 0),
  literacy_any = ifelse(literacy == 3 | literacy == 4, 1, 0),
  marital_status = neg_to_na(d6),
  married = ifelse(marital_status == 2 | marital_status == 3, 1, 0),
  #econ vars
  hh_income = neg_to_na(e1),
  #*_trim_99 passes the 99th percentile to neg_to_na() as outlier_top;
  #*_top_99 caps values at the 99th percentile instead
  hh_income_trim_99 = neg_to_na(hh_income,
                                outlier_top = quantile(hh_income,
                                                       probs = .99,
                                                       na.rm = TRUE)),
  hh_income_top_99 = ifelse(hh_income > quantile(hh_income,
                                                 probs = .99,
                                                 na.rm = TRUE),
                            quantile(hh_income,
                                     probs = .99,
                                     na.rm = TRUE),
                            hh_income),
  houses = neg_to_na(e2_a),
  houses_b = neg_to_na(e2_a, outlier_top = 100),
  acres_farmland = neg_to_na(e2_b),
  acres_plots = neg_to_na(e2_c),
  bicycles = neg_to_na(e2_d),
  chickens = neg_to_na(e2_f),
  goats = neg_to_na(e2_g),
  basic_cell_phones = neg_to_na(e2_m),
  smart_phones = neg_to_na(e2_n),
  sell_freq = rev_values(neg_to_na(e3)),
  sell_regular = ifelse(sell_freq > 5, 1, 0),
  sell_daily = ifelse(sell_freq == 8, 1, 0),
  days_pr_week = neg_to_na(e4_count, outlier_top = 7),
  stall_activity = neg_to_na(e7_b),
  stall_type = neg_to_na(te20),
  service = ifelse(e7_a == 2, 1, 0),
  prim_earner = e10_primary,
  yrs_in_mkt = e10_years,
  profit = neg_to_na(e10),
  profit_trim_99 = neg_to_na(profit,
                             outlier_top = quantile(profit,
                                                    probs = .99,
                                                    na.rm = TRUE)),
  profit_top_99 = ifelse(profit > quantile(profit,
                                           probs = .99,
                                           na.rm = TRUE),
                         quantile(profit,
                                  probs = .99,
                                  na.rm = TRUE),
                         profit),
  customers_pr_day = neg_to_na(e11),
  customers_pr_day_top_99 = if_else(customers_pr_day > quantile(customers_pr_day,
                                                                probs = .99,
                                                                na.rm = TRUE),
                                    quantile(customers_pr_day,
                                             probs = .99,
                                             na.rm = TRUE),
                                    customers_pr_day),
  customers_comp_lst_yr = rev_values(neg_to_na(e12)),
  profit_lst_yr_month = rev_values(neg_to_na(e13)),
  profit_lst_yr_gen = rev_values(neg_to_na(e16)),
  econ_cond_5 = rev_values(neg_to_na(e15)),
  #collapse the 5-point scale to 3 points: 1-2 -> 1, 3 -> 2, 4-5 -> 3
  econ_cond_3 = ifelse(econ_cond_5 == 1 | econ_cond_5 == 2,
                       1, ifelse(econ_cond_5 == 4 | econ_cond_5 == 5,
                                 3, ifelse(econ_cond_5 == 3, 2, NA))))

# Survey non-response codes whose value labels must be dropped before the
# variables are converted to factors.
na_vec <- c(-66, -77, -88, -99)

# Strip those labels from the derived columns. `start` is a column position
# (presumably the first derived column -- confirm in remove_NA_labels()), so it
# must be updated if columns are added or removed upstream.
vendor_end <- remove_NA_labels(vendor_end, na_vec, start = 542)

# Attach value labels matching the recoded/reversed scales of the variables
# that need them.
# NOTE(review): in sell_freq, "4-6 days a week" (6) sorts below
# "1-3 days a week" (7) although every other level increases in frequency --
# confirm against the questionnaire coding of e3.
val_labels(vendor_end) <- list(
  education = c(
    "None" = 0,
    "Nursery School" = 1,
    "Standard 1" = 2,
    "Standard 2" = 3,
    "Standard 3" = 4,
    "Standard 4" = 5,
    "Standard 5" = 6,
    "Standard 6" = 7,
    "Standard 7" = 8,
    "Standard 8" = 9,
    "Form 1" = 10,
    "JCE/Form 2" = 11,
    "Form 3" = 12,
    "MSCE/Form 4" = 13,
    "Technical/Private College (non-Degree)" = 16,
    "Degree" = 17,
    "Masters" = 18,
    "PhD" = 19
  ),
  literacy = c(
    "Could not read" = 1,
    "Could read some of the card" = 2,
    "Could read the whole card with difficulty" = 3,
    "Could read the whole card with ease" = 4
  ),
  sell_freq = c(
    "First time" = 1,
    "Once a year" = 2,
    "Once every few months" = 3,
    "Once a month" = 4,
    "A few times a month (2-4 times)" = 5,
    "4-6 days a week" = 6,
    "1-3 days a week" = 7,
    "Every day" = 8
  ),
  customers_comp_lst_yr = c(
    "Way fewer (less than 1/2" = 1,
    "Fewer (less, but not 1/2)" = 2,
    "Same" = 3,
    "More (more but not double)" = 4,
    "Many more (more than double)" = 5
  ),
  profit_lst_yr_month = c(
    "Much lower (less than 1/2" = 1,
    "Lower (less, but not 1/2)" = 2,
    "Same" = 3,
    "More (more but not double)" = 4,
    "Much higher (more than double)" = 5
  ),
  profit_lst_yr_gen = c(
    "My profits are much lower today" = 1,
    "My profits are lower today" = 2,
    "My profits are about the same" = 3,
    "My profits are higher today" = 4,
    "My profits are much higher today" = 5
  ),
  econ_cond_5 = c(
    "Much Worse" = 1,
    "Worse" = 2,
    "Same" = 3,
    "Better" = 4,
    "Much Better" = 5
  ),
  econ_cond_3 = c("Worse" = 1, "Same" = 2, "Better" = 3)
)


##### Outcome Variables

# Build the list-experiment, fee-payment and receipt outcome columns.
# The fee questions were asked under four question orderings (op1-op4); each
# respondent has answers under one ordering, so the op*_ columns are collapsed
# into one column per item. replace2() is not defined in this file (presumably
# replace2(x, cond, y) fills x with y where cond is TRUE -- confirm in the
# sourced cleaning functions).
# NOTE: new columns are appended in the order written below, and later
# remove_NA_labels() calls address columns by position, so do not reorder.
vendor_end <- mutate(
  vendor_end,
  # 1 when only the treatment list item was answered, 0 when only the control
  # item was answered, NA otherwise
  list_treat = ifelse(!is.na(d15_treatment) & is.na(d15_control),
                      1,
                      ifelse(!is.na(d15_control) & is.na(d15_treatment),
                             0, NA)),
  # the answered list item, kept only when it is non-negative
  list_outcome = ifelse(!is.na(d15_treatment) &
                          is.na(d15_control) &
                          d15_treatment >= 0,
                        d15_treatment,
                        ifelse(!is.na(d15_control) &
                                 is.na(d15_treatment) &
                                 d15_control >= 0,
                               d15_control, NA)),
  # question ordering seen by the respondent: 5 for the long survey, otherwise
  # the first of op1-op4 with a non-missing tc1a answer. Each ordering is
  # tested against its own column so that orderings 3 and 4 can be assigned.
  order_indicator = ifelse(long_survey == 1, 5,
                           ifelse(!is.na(op1_tc1a), 1,
                                  ifelse(!is.na(op2_tc1a), 2,
                                         ifelse(!is.na(op3_tc1a), 3,
                                                ifelse(!is.na(op4_tc1a), 4, NA))))),
  # fee question 1: collapse the orderings, then blank values outside 0-5.
  # Every *_0s column range-checks its own item.
  fee1_full = replace2(op1_tc1a, is.na(op1_tc1a), op2_tc1a) %>%
    replace2(., is.na(.), op3_tc1a) %>%
    replace2(., is.na(.), op4_tc1a),
  fee1_full_0s = ifelse(fee1_full > 5 | fee1_full < 0, NA, fee1_full),
  fee1_part = replace2(op1_tc1b, is.na(op1_tc1b), op2_tc1b) %>%
    replace2(., is.na(.), op3_tc1b) %>%
    replace2(., is.na(.), op4_tc1b),
  fee1_part_0s = ifelse(fee1_part > 5 | fee1_part < 0, NA, fee1_part),
  fee1_none = replace2(op1_tc1c, is.na(op1_tc1c), op2_tc1c) %>%
    replace2(., is.na(.), op3_tc1c) %>%
    replace2(., is.na(.), op4_tc1c),
  fee1_none_0s = ifelse(fee1_none > 5 | fee1_none < 0, NA, fee1_none),
  fee1_outcome_0s = fee1_full_0s + .5*fee1_part_0s,
  # fee question 2: collapse the orderings, then blank values outside 0-10
  fee2_always = replace2(op1_tc3a, is.na(op1_tc3a), op2_tc3a) %>%
    replace2(., is.na(.), op3_tc3a) %>%
    replace2(., is.na(.), op4_tc3a),
  fee2_always_0s = ifelse(fee2_always > 10 | fee2_always < 0,
                          NA, fee2_always),
  fee2_sometimes = replace2(op1_tc3b, is.na(op1_tc3b), op2_tc3b) %>%
    replace2(., is.na(.), op3_tc3b) %>%
    replace2(., is.na(.), op4_tc3b),
  fee2_sometimes_0s = ifelse(fee2_sometimes > 10 | fee2_sometimes < 0,
                             NA, fee2_sometimes),
  fee2_never = replace2(op1_tc3c, is.na(op1_tc3c), op2_tc3c) %>%
    replace2(., is.na(.), op3_tc3c) %>%
    replace2(., is.na(.), op4_tc3c),
  fee2_never_0s = ifelse(fee2_never > 10 | fee2_never < 0,
                         NA, fee2_never),
  # final versions: answers where all three items are 0 are set to NA; the
  # *_0s columns keep them
  fee1_0s = (fee1_full_0s == 0) & (fee1_part_0s == 0) & (fee1_none_0s == 0),
  fee1_full = ifelse(!fee1_0s, fee1_full_0s, NA),
  fee1_part = ifelse(!fee1_0s, fee1_part_0s, NA),
  fee1_none = ifelse(!fee1_0s, fee1_none_0s, NA),
  fee1_outcome = fee1_full + .5*fee1_part,
  fee2_0s = (fee2_always_0s == 0) & (fee2_sometimes_0s == 0) & (fee2_never_0s == 0),
  fee2_always = ifelse(!fee2_0s, fee2_always_0s, NA),
  fee2_sometimes = ifelse(!fee2_0s, fee2_sometimes_0s, NA),
  fee2_never = ifelse(!fee2_0s, fee2_never_0s, NA),
  receipt_check = tc2,
  receipt_date = tc2_date,
  #note these are not final codings of recent_receipt_7, see
  #end of script for changes to recent_receipt_* variables
  #and receipt_date
  # 1 when the receipt date is within 7 (or 10) days of the interview start in
  # either direction, 0 otherwise, including when no receipt date is recorded
  recent_receipt_7 = ifelse(is.na(receipt_date), 0,
                            ifelse(abs(as.numeric(difftime(receipt_date,
                                                           starttime,
                                                           units = "days"))) <= 7, 1, 0)),
  recent_receipt_10 = ifelse(is.na(receipt_date), 0,
                             ifelse(abs(as.numeric(difftime(receipt_date,
                                                            starttime,
                                                            units = "days"))) <= 10, 1, 0)),
  no_rcpt_when_pay = neg_to_na(tc2_16)
)

### Other Outcomes

## Causal Mechanism Variables
# Cleaned copies of the mechanism items: each passes through neg_to_na() and,
# where wrapped in rev_values(), is also reversed. Column order matters for the
# positional remove_NA_labels() call further down.
vendor_end <- mutate(
  vendor_end,
  #Bottom-Up
  tr1_clean = rev_values(neg_to_na(tr1)),
  tr2_clean = rev_values(neg_to_na(tr2)),
  tr9e_clean = rev_values(neg_to_na(tr9e)),
  tr9g_clean = rev_values(neg_to_na(tr9g)),
  tr9h_clean = rev_values(neg_to_na(tr9h)),
  ms1_clean = rev_values(neg_to_na(ms1)),
  ms3_clean = rev_values(neg_to_na(ms3)),
  ms4_clean = rev_values(neg_to_na(ms4)),
  ms5_clean = rev_values(neg_to_na(ms5)),
  ms6_clean = rev_values(neg_to_na(ms6)),
  ms10_clean = rev_values(neg_to_na(ms10)),
  tc2_10_clean = neg_to_na(tc2_10),
  tc2_4b_clean = rev_values(neg_to_na(tc2_4b)),
  # collapse the four question orderings (op1-op4) into one column; within each
  # ordering the _b version is reversed before it is used as a fill-in
  tc2_29_clean = replace2(op1_tc2_29_a, is.na(op1_tc2_29_a),
                          rev_values(op1_tc2_29_b)) %>%
    replace2(., is.na(.), op2_tc2_29_a) %>%
    replace2(., is.na(.), rev_values(op2_tc2_29_b)) %>%
    replace2(., is.na(.), op3_tc2_29_a) %>%
    replace2(., is.na(.), rev_values(op3_tc2_29_b)) %>%
    replace2(., is.na(.), op4_tc2_29_a) %>%
    replace2(., is.na(.), rev_values(op4_tc2_29_b)),
  pay_even_disagree = ifelse(tc2_29_clean == 1, 1, 0),
  #must be run as multinomial logit
  tc2_15_impact_clean = neg_to_na(tc2_15_impact),
  #Top-Down
  tc5a_clean = rev_values(neg_to_na(tc5a)),
  tc5b_clean = rev_values(neg_to_na(tc5b)),
  tc2_15b_clean = rev_values(neg_to_na(tc2_15b)),
  tc9_clean = neg_to_na(tc9)
)

## Indirect Effects Variables
# Cleaned election item plus 0/1 indicators that equal 1 when the cleaned
# answer is coded 1.
vendor_end <- mutate(
  vendor_end,
  election3_clean = neg_to_na(election3),
  vote_intend = ifelse(election3_clean == 1, 1, 0),
  petition_anonymous = ifelse(neg_to_na(behavioral) == 1, 1, 0),
  petition_name = ifelse(neg_to_na(b1) == 1, 1, 0)
)

# relabel
# Strip the non-response value labels from the newly derived columns so they
# can become factors. `start` is a column position and must be kept in sync
# with the columns created above.
vendor_end <- remove_NA_labels(vendor_end, na_vec, start = 614)

# Reusable 4- and 5-point scale labels.
trust_label <- c(
  "Not at all trustworthy" = 1,
  "Not very trustworthy" = 2,
  "Somewhat trustworthy" = 3,
  "Very trustworthy" = 4
)

agree_label <- c(
  "Strongly Disagree" = 1,
  "Somewhat Disagree" = 2,
  "Somewhat Agree" = 3,
  "Strongly Agree" = 4
)

satisfied_label <- c(
  "Very Dissatisfied" = 1,
  "Somewhat Dissatisfied" = 2,
  "Somewhat Satisfied" = 3,
  "Very Satisfied" = 4
)

often_label <- c(
  "It is extremely rare -- I have never seen it" = 1,
  "It is not common" = 2,
  "It happens" = 3,
  "It happens a lot" = 4,
  "Always" = 5
)

# Attach the scale labels to the cleaned mechanism variables that need them.
val_labels(vendor_end) <- list(
  tr1_clean = trust_label,
  tr2_clean = trust_label,
  tr9e_clean = agree_label,
  tr9g_clean = agree_label,
  tr9h_clean = agree_label,
  ms1_clean = satisfied_label,
  ms3_clean = satisfied_label,
  ms4_clean = satisfied_label,
  ms5_clean = satisfied_label,
  ms6_clean = satisfied_label,
  ms10_clean = satisfied_label,
  tc2_4b_clean = agree_label,
  tc5a_clean = agree_label,
  tc5b_clean = agree_label,
  tc2_15b_clean = agree_label,
  tc2_29_clean = c(
    "Vendors should always pay tax even if they disagree with local government." = 1,
    "Vendors should only pay tax if they agree with local government." = 2
  ),
  no_rcpt_when_pay = often_label
)

## Spillover Questions

# Cleaned/reversed scale items plus descriptive aliases for the sp2_*, sp4_*
# and sp7_* indicator columns. Column order matters for the positional
# remove_NA_labels() call below.
vendor_end <- mutate(
  vendor_end,
  sp1_clean = rev_values(neg_to_na(sp1)),
  sp2_there_more_customers = sp2_1,
  sp2_there_better_dev = sp2_2,
  sp2_there_easier_not_pay = sp2_3,
  sp2_for_comparison = sp2_4,
  sp2_here_fee_too_high = sp2_5,
  sp2_here_tax_col_too_agg = sp2_6,
  sp3_clean = rev_values(neg_to_na(sp3)),
  sp4_here_more_customers = sp4_1,
  sp4_here_better_dev = sp4_2,
  sp4_here_easier_not_pay = sp4_3,
  sp4_for_comparison = sp4_4,
  sp5_clean = neg_to_na(sp5),
  sell_in_othr_mkts = ifelse(sp5_clean == 1, 1, 0),
  sp7_close_tog = sp7_1,
  sp7_open_diff_days = sp7_2,
  sp7_good_dev = sp7_3,
  sp7_more_customers = sp7_4,
  sp7_how_mkt_managed = sp7_5,
  sp7_fees_low = sp7_6,
  # reversed scale shifted down by one, then the value 7 is recoded to 0
  sp8_clean = rev_values(neg_to_na(sp8)) - 1,
  sp8_clean = ifelse(sp8_clean == 7, 0, sp8_clean),
  compl_surv_othr_mkt = neg_to_na(final1),
  final2_clean = neg_to_na(final2),
  compl_surv_since_Sep18 = neg_to_na(final2b),
  date_othr_surv = final2a
)

# relabel
# Strip the non-response value labels from the spillover columns; `start` and
# `end` are column positions and must track the columns created above.
vendor_end <- remove_NA_labels(vendor_end, na_vec, start = 640, end = 663)

sp1_label <- c(
  "No, I don't know any vendors who have done this" = 1,
  "Yes, but I don't know how many" = 2,
  "Yes, I know 1-5 vendors who have done this" = 3,
  "Yes, I know 5-10 vendors who have done this" = 4,
  "Yes, I know at least 10 vendors who have done this" = 5
)

freq2_label <- c(
  "Never" = 0,
  "Once a year" = 1,
  "Once every few months" = 2,
  "Once a month" = 3,
  "A few times a month" = 4,
  "Every Week" = 5,
  "Every Day" = 6
)

val_labels(vendor_end) <- list(
  sp1_clean = sp1_label,
  sp3_clean = sp1_label,
  sp8_clean = freq2_label
)


## Compliance Questions
# Derived compliance columns: 0/1 indicators (1 when the cleaned item is coded
# as compared below), cleaned/reversed scale items, and descriptive aliases for
# the multi-select indicator columns. Trailing comments name the value-label
# set each scale uses. Column order matters for the positional
# remove_NA_labels() call that follows this block.
vendor_end <- mutate(
  vendor_end,
  # market committee
  active_committee = ifelse(neg_to_na(s5) == 1, 1, 0),
  committee_elected = ifelse(neg_to_na(s8a) == 1, 1, 0),
  attend_com_meetings = ifelse(neg_to_na(s6) == 1, 1, 0),
  commitee_appointed = ifelse(neg_to_na(s8a) == 2, 1, 0),
  mkt_com_strong_advocate = rev_values(neg_to_na(s8b)), #agree label
  mkt_com_training = ifelse(neg_to_na(s9j) == 1, 1, 0),
  mkt_com_training_mkt_mang = s9d_1,
  mkt_com_training_wc = s9d_2,
  mkt_com_training_dg = s9d_3,
  mkt_com_training_mp = s9d_4,
  mkt_com_training_LGAP = s9d_5,
  mkt_com_training_mkt_com = s9d_6,
  mkt_com_training_chief = s9d_7,
  # meetings and what was discussed
  meeting_held = ifelse(neg_to_na(te1) == 1, 1, 0),
  meeting_attended = ifelse(neg_to_na(te2) == 1, 1, 0),
  meeting_disc_dev = te3_1,
  meeting_disc_pay_fees = te3_2,
  meeting_disc_corrup = te3_3,
  meeting_disc_use_fees = te3_4,
  meeting_disc_sell_outside = te3_5,
  meeting_disc_com_elec = te3_6,
  meeting_disc_how_info_rev = te3_7,
  meeting_disc_how_rpt_issue = te3_8,
  meeting_disc_const = te3_9,
  ms_disc_toilets = te3_ms_1,
  ms_disc_water = te3_ms_2,
  ms_disc_security = te3_ms_3,
  ms_disc_lights = te3_ms_4,
  ms_disc_roofs = te3_ms_5,
  ms_disc_stalls = te3_ms_6,
  ms_disc_paths = te3_ms_7,
  ms_disc_trash = te3_ms_8,
  ms_disc_electricity = te3_ms_9,
  # projects
  meeting_pick_proj = ifelse(neg_to_na(te3a) == 1, 1, 0),
  meeting_proj_chosen = neg_to_na(te3b), #te3_ms label
  meeting_proj_received = neg_to_na(te3c), #te3_ms label
  mkt_chos_best_proj = rev_values(neg_to_na(te3d)), #agree label
  proj_benef_vend = rev_values(neg_to_na(te3e)), #agree label
  proj_show_dg_listens = rev_values(neg_to_na(te3f)), #agree label
  # construction
  any_constr = ifelse(neg_to_na(te6) == 1, 1, 0),
  constr_toilets = te7_1,
  constr_water = te7_2,
  constr_security = te7_3,
  constr_lights = te7_4,
  constr_roofs = te7_5,
  constr_stalls = te7_6,
  constr_paths = te7_7,
  constr_trash = te7_8,
  constr_electricity = te7_9,
  who_resp_constr = neg_to_na(te8), #te4 label
  who_fund_constr = neg_to_na(te8_funded), #te4 label
  # revenue information and SMS systems
  rev_info_system = rev_values(neg_to_na(te13)), #te13 label (- 1 when num)
  how_acc_info = rev_values(neg_to_na(te15)), #accurate label
  info_source_sms = te14_1,
  info_source_poster = te14_2,
  info_source_told = te14_3,
  info_source_mkt_com = te14_4,
  info_source_mkt_mang_fc = te14_5,
  how_freq_sms = rev_values(neg_to_na(te14a)), #sms_freq label
  received_sms = ifelse(how_freq_sms > 1, 1, 0),
  exist_sms_compl_sys = ifelse(neg_to_na(te19a) == 1, 1, 0),
  used_sms_compl_sys = ifelse(neg_to_na(te19ab) == 1, 1, 0),
  how_likely_send_sms = rev_values(neg_to_na(te14d)), #likely label
  # LGAP
  familiar_LGAP = ifelse(neg_to_na(te22) == 1, 1, 0),
  opinion_LGAP = rev_values(neg_to_na(te24)) #positive_negative_5 label (-1 when num)
)

# relabel
# Strip the non-response value labels from the compliance columns; `start` and
# `end` are column positions and must track the columns created above.
vendor_end <- remove_NA_labels(vendor_end, na_vec, start = 666, end = 730)

# Project types; the second version adds a 0 code for "no project".
te3_ms_label1 <- c(
  "Toilets" = 1,
  "Water taps" = 2,
  "Security" = 3,
  "Lights" = 4,
  "Roofs" = 5,
  "Stalls" = 6,
  "Paths" = 7,
  "Trash skip" = 8,
  "Electricity" = 9
)
te3_ms_label2 <- c("No project happened" = 0, te3_ms_label1)

# Actors responsible for / funding construction.
te4_label <- c(
  "Market Manager" = 1,
  "Ward Councilor" = 2,
  "Other District Government" = 3,
  "Member of parliament" = 4,
  "USAID/DAI/LGAP " = 5,
  "Market Vendors/Market Committee" = 6,
  "Traditional leaders/chiefs" = 7
)

te13_label <- c(
  "No" = 1,
  "Yes - can find out how spent only" = 2,
  "Yes - can find out amount collected only" = 3,
  "Yes - find out both amount collect and how it was spent" = 4
)

accurate_label <- c(
  "Very inaccurate" = 1,
  "Somewhat inaccurate" = 2,
  "Somewhat accurate" = 3,
  "Very accurate" = 4
)

sms_freq_label <- c(
  "Never" = 1,
  "Once" = 2,
  "Every other month (about 3 times)" = 3,
  "About once a month (about 6 times)" = 4,
  "More than once a month (more than 6 times)" = 5
)

likely_label <- c(
  "Very Unlikely" = 1,
  "Somewhat Unlikely" = 2,
  "Somewhat Likely" = 3,
  "Very Likely " = 4
)

pos_neg_label <- c(
  "I haven't heard enough to say" = 1,
  "Very negative" = 2,
  "Somewhat negative" = 3,
  "Somewhat positive" = 4,
  "Very positive" = 5
)

# Attach the label sets to the compliance variables (agree_label is defined
# earlier in the script).
val_labels(vendor_end) <- list(
  meeting_proj_chosen = te3_ms_label1,
  meeting_proj_received = te3_ms_label2,
  mkt_chos_best_proj = agree_label,
  mkt_com_strong_advocate = agree_label,
  proj_benef_vend = agree_label,
  proj_show_dg_listens = agree_label,
  who_resp_constr = te4_label,
  who_fund_constr = te4_label,
  rev_info_system = te13_label,
  how_acc_info = accurate_label,
  how_freq_sms = sms_freq_label,
  how_likely_send_sms = likely_label,
  opinion_LGAP = pos_neg_label
)


# Post-Cleaning Tasks ----------------------------------------------------------

#convert every labelled column to a factor built from its value labels
vendor_end <- vendor_end %>%
  mutate(across(where(is.labelled), to_factor))

#merge in treatment assignment: survey market/district are matched to the
#Market/District columns of the treatment file
vendor_end <- vendor_end %>%
  left_join(read_csv('data/2_clean/treatment_groups.csv'),
            by = c(market = "Market", district = "District"))

#create BU and TD treatment indicators
#  BU_treat / TD_treat: 1 if treatment_status mentions that arm or "BOTH"
#  BU / TD / Both: mutually exclusive indicators for exactly one arm or both
vendor_end <- vendor_end %>%
  mutate(
    BU_treat = as.numeric(str_detect(treatment_status, "BU|BOTH")),
    TD_treat = as.numeric(str_detect(treatment_status, "TD|BOTH")),
    BU = ifelse(BU_treat == 1 & TD_treat == 0, 1, 0),
    TD = ifelse(BU_treat == 0 & TD_treat == 1, 1, 0),
    Both = ifelse(BU_treat == 1 & TD_treat == 1, 1, 0)
  )

#merge in treatment assignment block ids (labelled columns of the block file
#are turned into factors before joining on market and district)
vendor_end <- vendor_end %>%
  left_join(read_dta("data/1_raw/block_ids.dta") %>%
              mutate(across(where(is.labelled), to_factor)),
            by = c("market", "district"))

# Behavioral Outcomes
#Builds 0/1 indicators and combined answer columns from the behavioral items
#(behavioral, b1, b2a, b6_*). Comparisons with a missing value stay NA inside
#if_else(), so a missing answer yields NA rather than 0.
vendor_end <- vendor_end %>% 
  mutate(behavioral_cl = factor(behavioral, levels = c("No", "Yes")),
         #any value of behavioral other than "No"/"Yes" is NA in behavioral_cl
         petition = if_else(behavioral_cl == "Yes", 1, 0),
         b1_cl = factor(b1, levels = c("No", "Yes")),
         petition_wname = if_else(b1_cl == "Yes", 1, 0),
         kw600 = if_else(b2a == "600 Kw in one week", 1, 0),
         stmt1_first = if_else(b6_order == 1, 1, 0),
         #first non-missing of the two statement-1 answer columns
         #NOTE(review): presumably only one of *_first / *_last is filled per
         #respondent, depending on b6_order -- confirm against questionnaire
         stmt1_msg = coalesce(b6_s1_first, b6_s1_last),
         #1 for any non-missing answer other than "No"
         stmt1_agree = if_else(stmt1_msg != "No", 1, 0),
         #1 only for the two "Yes, ..." answers listed below
         stmt1_agree_sent = if_else(stmt1_msg == "Yes, only agree" |
                                      stmt1_msg == "Yes, with message", 1, 0),
         stmt1_msg_txt = coalesce(b6_s1_msg_first, b6_s1_msg_last),
         stmt1_no_msg_why = coalesce(b6_s1_no_msg_first, b6_s1_no_msg_last),
         #statement 2 has a single set of columns, copied under parallel names
         stmt2_msg = b6_s2,
         stmt2_msg_txt = b6_s2_msg,
         stmt2_no_msg_why = b6_s2_no_msg,
         #same coding as the statement-1 indicators above
         stmt2_agree = if_else(stmt2_msg != "No", 1, 0),
         stmt2_agree_sent = if_else(stmt2_msg == "Yes, only agree" |
                                      stmt2_msg == "Yes, with message", 1, 0))

# Post-Post-Cleaning Tasks  ----------------------------------------------------
#numeric versions of the factor outcome variables: as.numeric() on a factor
#returns its integer level codes
vendor_end <- mutate(
  vendor_end,
  tr1_num = as.numeric(tr1_clean),
  tr2_num = as.numeric(tr2_clean),
  tr9e_num = as.numeric(tr9e_clean),
  tr9g_num = as.numeric(tr9g_clean),
  tr9h_num = as.numeric(tr9h_clean),
  ms1_num = as.numeric(ms1_clean),
  ms4_num = as.numeric(ms4_clean),
  ms3_num = as.numeric(ms3_clean),
  ms5_num = as.numeric(ms5_clean),
  ms6_num = as.numeric(ms6_clean),
  ms10_num = as.numeric(ms10_clean),
  tc2_4b_num = as.numeric(tc2_4b_clean),
  tc5a_num = as.numeric(tc5a_clean),
  tc5b_num = as.numeric(tc5b_clean),
  tc2_29_num = as.numeric(tc2_29_clean),
  tc2_15b_num = as.numeric(tc2_15b_clean),
  #mean of five ms items; NA as soon as any one of them is missing
  ms_average = (ms1_num + ms3_num + ms4_num + ms5_num + ms6_num) / 5,
  #ms10 under a second name, plus its numeric version
  satisfaction_dev = ms10_clean,
  satisfaction_dev_num = as.numeric(satisfaction_dev),
  #enumerator and supervisor ids stored as text
  enum = as.character(enum),
  sup = as.character(sup),
  no_rcpt_when_pay_num = as.numeric(no_rcpt_when_pay),
  mkt_chos_best_proj_num = as.numeric(mkt_chos_best_proj),
  proj_benef_vend_num = as.numeric(proj_benef_vend),
  proj_show_dg_listens_num = as.numeric(proj_show_dg_listens),
  how_acc_info_num = as.numeric(how_acc_info),
  #0 for "No", 1 for every other non-missing answer
  rev_info_system_dich = ifelse(rev_info_system == "No", 0, 1),
  how_freq_sms_num = as.numeric(how_freq_sms),
  how_likely_send_sms_num = as.numeric(how_likely_send_sms),
  opinion_LGAP_num = as.numeric(opinion_LGAP),
  #code 1 ("I haven't heard enough to say") becomes NA; the remaining codes
  #are shifted down by one
  opinion_LGAP_num2 = ifelse(opinion_LGAP_num == 1,
                             NA,
                             opinion_LGAP_num - 1),
  mkt_com_strong_advocate_num = as.numeric(mkt_com_strong_advocate)
)

# Binary versions of the spillover questions: 1 if the cleaned answer contains
# "Yes", 0 for any other non-missing answer, NA if missing
vendor_end <- mutate(
  vendor_end,
  sp1_yn = ifelse(grepl("Yes", sp1_clean), 1,
                  ifelse(is.na(sp1_clean), NA, 0)),
  sp3_yn = ifelse(grepl("Yes", sp3_clean), 1,
                  ifelse(is.na(sp3_clean), NA, 0))
)

#fixing issues with yrs in mkt variable: values above 100 are set to missing
vendor_end <- vendor_end %>%
  mutate(yrs_in_mkt_fix = ifelse(yrs_in_mkt <= 100, yrs_in_mkt, NA))

#making education numeric
#  educ_num:  factor level code minus 1, so the first level is 0
#  educ_none: 1 when educ_num is 0
#  educ_cat:  1 for educ_num in (0, 9], 2 for (9, 13], 3 above 13; values at
#             or below 0 keep their educ_num value
vendor_end <- vendor_end %>%
  mutate(
    educ_num = as.numeric(education) - 1,
    educ_none = as.numeric(educ_num == 0),
    educ_cat = ifelse(educ_num > 13, 3,
                      ifelse(educ_num > 9, 2,
                             ifelse(educ_num > 0, 1, educ_num)))
  )

#First step of the receipt-date clean-up: record the receipt age implied by
#the raw survey data, keep a copy of the raw receipt date, and split the date
#into year/month/day with a corrected year (receipt_year).
vendor_end <- vendor_end %>% 
  mutate(receipt_age_orig = as.numeric(difftime(strptime(starttime, 
                                                         format = "%Y-%m-%d",
                                                         tz = "UTC"),
                                                receipt_date,
                                                units = "days")), 
         #receipt_age_orig = days from receipt_date to the calendar date of
         #starttime (only the date part of starttime is parsed), as determined
         #purely by survey data
         #the more positive, the older the receipt; negative values are
         #"impossible" values (receipt dated after the interview)
         #year fix: receipt years after 2018 are turned into 2018 (likely
         #enumerator error), except for Jenda Market, which was visited in 2019
         receipt_date_orig = receipt_date, #save old receipt_date
         receipt_yr = year(receipt_date),
         receipt_month = month(receipt_date),
         receipt_day = day(receipt_date),
         receipt_year = if_else(receipt_yr > 2018 & market != "Jenda Market",
                                2018, receipt_yr),
         #receipt_yr was only a helper for receipt_year; drop it again
         receipt_yr = NULL)

#fixing receipt date: rebuild it from the corrected year and the original
#month and day; rows without an original receipt date stay missing
vendor_end <- mutate(
  vendor_end,
  receipt_date = if_else(is.na(receipt_date_orig),
                         NA,
                         make_date(year = receipt_year,
                                   month = receipt_month,
                                   day = receipt_day))
)


#further fixes
#Recomputes the receipt age from the current receipt_date and derives the
#recent-receipt indicators. Order matters: each recent_receipt_* column is
#built in two passes, and the *_ofcl columns read the finished values.
vendor_end <- vendor_end %>% 
  mutate(receipt_age = as.numeric(difftime(strptime(starttime, 
                                                    format = "%Y-%m-%d", 
                                                    tz = "UTC"),
                                           receipt_date,
                                           units = "days")),
         #now correct -2 and -3 values, which arose due to faulty tablets
         receipt_age = ifelse(receipt_age == -2 | receipt_age == -3, 0,
                              receipt_age),
         #retain old recent_receipt vars
         recent_receipt_7_v2a = recent_receipt_7,
         recent_receipt_10_v2a = recent_receipt_10,
         #make recent_receipt_* vars drop nonsensical values
         #pass 1: 0 if there is no receipt date, 1 if the receipt is 0-7 days
         #old, 0 otherwise
         recent_receipt_7 = ifelse(is.na(receipt_date), 0, 
                                   ifelse(receipt_age <= 7 &
                                            receipt_age >= 0, 1, 0)),
         #pass 2: a dated receipt with a negative age becomes NA
         recent_receipt_7 = ifelse(is.na(receipt_date), recent_receipt_7, 
                                   ifelse(receipt_age < 0,
                                          NA, recent_receipt_7)),
         #same two passes with a 10-day window
         recent_receipt_10 = ifelse(is.na(receipt_date), 0, 
                                    ifelse(receipt_age <= 10 &
                                             receipt_age >= 0, 1, 0)),
         recent_receipt_10 = ifelse(is.na(receipt_date), recent_receipt_10, 
                                    ifelse(receipt_age < 0,
                                           NA, recent_receipt_10)),
         #recent receipt AND tc2a == "Yes"; NA wherever recent_receipt_* is NA
         #NOTE(review): tc2a is presumably the "receipt looked official"
         #item -- confirm against the questionnaire
         recent_receipt_7_ofcl = ifelse(is.na(recent_receipt_7), NA,
                                        ifelse(recent_receipt_7 == 1 &
                                                 tc2a == "Yes", 1, 0)),
         recent_receipt_10_ofcl = ifelse(is.na(recent_receipt_10), NA,
                                         ifelse(recent_receipt_10 == 1 &
                                                  tc2a == "Yes", 1, 0)),
         #make version of recent receipt where we count "impossible" values as 0s
         recent_receipt_7_v2b = ifelse(is.na(receipt_date), 0, 
                                       ifelse(receipt_age <= 7 &
                                                receipt_age >= 0, 1, 0)),
         recent_receipt_10_v2b = ifelse(is.na(receipt_date), 0, 
                                        ifelse(receipt_age <= 10 &
                                                 receipt_age >= 0, 1, 0)),
         #make receipt_check 1/0
         #1 if receipt_check is "Receipt Available", 0 for any other answer,
         #NA if missing
         receipt_shown = ifelse(is.na(receipt_check), NA,
                                as.numeric(receipt_check == "Receipt Available"))
  )

#binary versions of some intermediate outcome vars for additional analysis
#requested by USAID: 1 when the numeric score is 3 or higher, 0 when it is
#lower, NA when the score is missing (ifelse() passes the NA through)
vendor_end <- mutate(
  vendor_end,
  tr1_bin = ifelse(tr1_num >= 3, 1, 0),
  tr2_bin = ifelse(tr2_num >= 3, 1, 0),
  ms10_bin = ifelse(ms10_num >= 3, 1, 0),
  ms1_bin = ifelse(ms1_num >= 3, 1, 0),
  tc2_4b_bin = ifelse(tc2_4b_num >= 3, 1, 0),
  tc2_15b_bin = ifelse(tc2_15b_num >= 3, 1, 0)
)

## Add'l Receipts Variables
#  got_rcpt_last_time:      1 if tc7 is "Yes, just for me", else 0 (NA if missing)
#  typ_rcpt_keep_frvr:      1 if tc7_b is the "keep forever" answer, else 0
#  typ_rcpt_keep_days_mult: number of days per unit named in tc7_b
#  typ_rcpt_keep_days:      tc7_a converted to days, passed through neg_to_na()
vendor_end <- vendor_end %>%
  mutate(
    got_rcpt_last_time = as.numeric(tc7 == "Yes, just for me"),
    typ_rcpt_keep_frvr = as.numeric(
      tc7_b == "I keep it forever/never dispose of it"
    ),
    #look the unit up by name; the "keep forever" answer, missing values and
    #any other unit have no entry and therefore give NA
    typ_rcpt_keep_days_mult = unname(
      c("Hours" = 1/24,
        "Days" = 1,
        "Weeks" = 7,
        "Months" = 30)[as.character(tc7_b)]
    ),
    typ_rcpt_keep_days = neg_to_na(tc7_a * typ_rcpt_keep_days_mult)
  )

# Saving -----------------------------------------------------------------------
#write the cleaned data frame to disk as an .RData file (object name vendor_end)
save(list = "vendor_end", file = "data/2_clean/vendor_end.RData")
